Critter Detector
Below is a video from a camera trap set in New Zealand. We want to detect which species of critter appears in it.
from IPython.display import Video
# Embed the camera-trap clip directly in the notebook output
Video('vid0027.mp4', embed=True)
We load the required modules. This notebook requires tensorflow==2.2 and openCV_python==2.
import argparse
import glob
import os
import statistics
import sys
import time
import warnings
import cv2
import json
from model import efficientdet
from utils import preprocess_image, postprocess_boxes
import humanfriendly
import numpy as np
from tqdm import tqdm
#from ct_utils import truncate_float
#import visualization.visualization_utils as viz_utils
The following block contains the code used to load the model and run inference.
# Silence known-noisy warnings *before* TensorFlow is imported.
# PIL "cannot read EXIF metainfo for the images" warnings
warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
# "Metadata Warning, tag 256 had too many entries: 42, expected 1"
warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)
# NumPy FutureWarnings raised while importing TensorFlow
warnings.filterwarnings('ignore', category=FutureWarning)

# Imported here, after the filters above, so the import itself stays quiet
from tensorflow import keras
import tensorflow as tf

# Create a TF1-compatibility session configured for on-demand GPU memory
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # log which device each operation runs on
sess = tf.compat.v1.Session(config=config)

print('TensorFlow version:', tf.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own deprecation notice
# points to tf.config.list_physical_devices('GPU') as the replacement.
print('Is GPU available? tf.config.list_physical_devices:',
      bool(tf.config.list_physical_devices('GPU')))
Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
TensorFlow version: 2.2.0
WARNING:tensorflow:From <ipython-input-3-fd84f7977e8b>:21: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available? tf.test.is_gpu_available: False
class VideoPathUtils:
    """Helpers for recognising video files by extension and naming rendered outputs."""

    # Inserted into an output file name, before the extension, for the rendered result
    DETECTION_FILENAME_INSERT = '_detections'

    # Lower-case extensions (including the leading dot) that count as video files
    video_extensions = ['.avi', '.mov', '.mp4']

    @staticmethod
    def is_video_file(s):
        """
        Return True if the extension of path ``s`` is one of ``video_extensions``.

        The comparison is case-insensitive; a path with no extension is not a video.
        """
        ext = os.path.splitext(s)[1]
        return ext.lower() in VideoPathUtils.video_extensions

    @staticmethod
    def find_video_files(strings):
        """
        Filter ``strings`` (candidate file names) down to those that look like
        video files, judged by extension only. Input order is preserved.
        """
        return [s for s in strings if VideoPathUtils.is_video_file(s)]

    @staticmethod
    def find_videos(dir_name, recursive=False):
        """
        List the paths under ``dir_name`` that look like video files.

        Args:
            dir_name: directory to search
            recursive: if True, also search every sub-directory

        Returns:
            list of matching paths, in the order returned by ``glob``
        """
        if recursive:
            pattern = os.path.join(dir_name, '**', '*.*')
        else:
            pattern = os.path.join(dir_name, '*.*')
        candidates = glob.glob(pattern, recursive=recursive)
        return VideoPathUtils.find_video_files(candidates)
class TFDetector:
    """
    An EfficientDet-based detector whose weights are loaded at construction time.

    Frames are processed one at a time (batch size 1); supporting larger batches
    would require changes to the pre- and post-processing in this class.
    """

    # Number of decimal places to round to for confidence and bbox coordinates
    CONF_DIGITS = 3
    COORD_DIGITS = 4

    # Inference is run on a single image per call
    BATCH_SIZE = 1

    # An enumeration of failure reasons
    FAILURE_TF_INFER = 'Failure TF inference'
    FAILURE_IMAGE_OPEN = 'Failure image access'

    DEFAULT_RENDERING_CONFIDENCE_THRESHOLD = 0.4  # to render bounding boxes
    DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD = 0.2  # to include in the output json file

    # Maps the string category id found in each detection to a readable label
    DEFAULT_DETECTOR_LABEL_MAP = {
        '1': 'rat',
        '2': 'possum',
        '3': 'stoat',
        '4': 'cat',
        '5': 'bird',
        '6': 'leaf'
    }

    # Used for colour assignment when rendering.
    # NOTE(review): the label map above has six entries; confirm whether this
    # should be 6. The value is left unchanged for existing callers.
    NUM_DETECTOR_CATEGORIES = 5

    def __init__(self, model_path, phi):
        """Build the EfficientDet network for ``phi`` and load its weights.

        Args:
            model_path: path to the weights file passed to ``load_weights``
            phi: EfficientDet compound-scaling index (0-6); it also selects the
                square input resolution the frames are resized to
        """
        # efficientdet() returns a pair of models; only the second is used for prediction
        _, prediction_model = efficientdet(phi, num_classes=6, num_anchors=9,
                                           weighted_bifpn=False, freeze_bn=False,
                                           detect_quadrangle=False)
        # Weights are matched to layers by name; mismatching layers are skipped
        prediction_model.load_weights(model_path, by_name=True, skip_mismatch=True)
        self.model = prediction_model

        image_sizes = (512, 640, 768, 896, 1024, 1280, 1408)
        self.image_size = image_sizes[phi]

    @staticmethod
    def round_and_make_float(d, precision=4):
        """Return ``d`` as a built-in float rounded to ``precision`` decimal places.

        The rounding is done locally because the external ``truncate_float``
        helper is not imported in this file.
        """
        return round(float(d), precision)

    @staticmethod
    def __convert_coords(np_array):
        """Convert a corner-format box to [x1, y1, width_box, height_box] as Python floats.

        Args:
            np_array: one predicted box of four numbers. The arithmetic below treats
                it as [x1, y1, x2, y2] — TODO confirm against the model's box output.

        Returns:
            list [x1, y1, width_box, height_box], in the same units as the input,
            each rounded to COORD_DIGITS decimal places
        """
        width_box = np_array[2] - np_array[0]
        height_box = np_array[3] - np_array[1]
        # must be a list of Python floats (not a numpy array) so it can be JSON-serialised
        return [
            TFDetector.round_and_make_float(value, precision=TFDetector.COORD_DIGITS)
            for value in (np_array[0], np_array[1], width_box, height_box)
        ]

    def _generate_detections_one_image(self, image):
        """Pre-process ``image`` and run the model on it as a batch of one.

        Returns:
            the (boxes, scores, classes) arrays returned by ``self.model.predict``
        """
        n_image = preprocess_image(image, self.image_size)
        # preprocess_image's result is indexed at [0] for the image data;
        # a leading batch axis is then added for predict()
        n_image = tf.expand_dims(n_image[0], 0)
        box_tensor_out, score_tensor_out, class_tensor_out = self.model.predict(n_image)
        return box_tensor_out, score_tensor_out, class_tensor_out

    def generate_detections_one_image(self, image, image_id,
                                      detection_threshold=DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD):
        """Apply the detector to one image.

        Args:
            image: the image to run on, in whatever form ``preprocess_image`` accepts
            image_id: identifier for the image; stored in the `frame` field of the result
            detection_threshold: confidence above which a detection is included

        Returns:
            A dict with the following fields:
            - frame (always present): ``image_id``
            - max_detection_conf: highest confidence among the kept detections (0.0 if none)
            - detections: list of dicts with `category` (str), `conf` and
              `bbox` ([x, y, width, height])
            - failure: present only if inference raised; the two fields above are then absent
        """
        result = {
            'frame': image_id
        }
        try:
            b_box, b_score, b_class = self._generate_detections_one_image(image)

            # our batch size is 1; need to loop the batch dim if supporting batch size > 1
            boxes, scores, classes = b_box[0], b_score[0], b_class[0]

            detections_cur_image = []  # stays empty for an image with no confident detections
            max_detection_conf = 0.0
            for b, s, c in zip(boxes, scores, classes):
                if not s > detection_threshold:
                    continue
                x, y, width_box, height_box = TFDetector.__convert_coords(b)
                # Scale by the network input size.
                # NOTE(review): the 16/9 factor on the vertical values hard-codes a
                # 16:9 landscape frame; other aspect ratios presumably need the real
                # frame shape here — confirm against preprocess_image.
                normalised = (
                    x / self.image_size,
                    y / self.image_size * 16 / 9,
                    width_box / self.image_size,
                    height_box / self.image_size * 16 / 9,
                )
                conf = float(s)  # cast to float for json serialization
                detections_cur_image.append({
                    'category': str(int(c)),  # string type for the numerical class label
                    'conf': TFDetector.round_and_make_float(
                        conf, precision=TFDetector.CONF_DIGITS),
                    # round after scaling so the emitted values really are short
                    'bbox': [
                        TFDetector.round_and_make_float(
                            value, precision=TFDetector.COORD_DIGITS)
                        for value in normalised
                    ]
                })
                if conf > max_detection_conf:
                    max_detection_conf = conf

            result['max_detection_conf'] = TFDetector.round_and_make_float(
                max_detection_conf, precision=TFDetector.CONF_DIGITS)
            result['detections'] = detections_cur_image

        except Exception as e:
            # Deliberate best-effort: one bad frame must not abort a whole video
            result['failure'] = TFDetector.FAILURE_TF_INFER
            print('TFDetector: image {} failed during inference: {}'.format(image_id, str(e)))

        return result
#%% Main function

# BGR colours for drawn boxes, picked by numeric detection category
_BOX_COLORS_BGR = (
    (0, 0, 255),
    (0, 255, 0),
    (255, 0, 0),
    (0, 255, 255),
    (255, 0, 255),
    (255, 255, 0),
)


def _unique_output_name(file_name, used_names):
    """Return ``file_name``, or a numbered variant not yet in ``used_names``, and record it."""
    candidate = file_name
    n_collisions = 0
    while candidate in used_names:
        candidate = '{}_{}'.format(n_collisions, file_name)
        n_collisions += 1
    used_names.add(candidate)
    return candidate


def _draw_detections(frame, detections, label_map, confidence_threshold):
    """Draw a labelled box on ``frame`` (in place) for each sufficiently confident detection."""
    frame_height, frame_width = frame.shape[:2]
    for detection in detections:
        if detection['conf'] < confidence_threshold:
            continue
        # bbox is [x, y, width, height] as fractions of the frame size
        x, y, box_width, box_height = detection['bbox']
        left = min(max(int(round(x * frame_width)), 0), frame_width - 1)
        top = min(max(int(round(y * frame_height)), 0), frame_height - 1)
        right = min(max(int(round((x + box_width) * frame_width)), 0), frame_width - 1)
        bottom = min(max(int(round((y + box_height) * frame_height)), 0), frame_height - 1)

        category = detection['category']
        color = _BOX_COLORS_BGR[int(category) % len(_BOX_COLORS_BGR)]
        label = '{}: {}%'.format(label_map.get(category, category),
                                 round(100 * detection['conf']))
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
        # keep the label inside the frame when the box touches the top edge
        cv2.putText(frame, label, (left, max(top - 5, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)


def _process_video(tf_detector, vid_file, output_full_path, detection_results,
                   render_confidence_threshold, output_fps, time_load, time_infer):
    """Run the detector on every frame of ``vid_file`` and write the annotated video.

    Per-frame results are appended to ``detection_results`` and timings to
    ``time_load`` / ``time_infer``. Raises IOError if the video cannot be opened.
    """
    capture = cv2.VideoCapture(vid_file)
    writer = None
    try:
        if not capture.isOpened():
            raise IOError('could not open video file')
        frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        detection_results.append({
            'file': vid_file,
            'width': frame_width,
            'height': frame_height
        })
        writer = cv2.VideoWriter(output_full_path,
                                 cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                 output_fps, (frame_width, frame_height))
        frame_no = 0
        while True:
            start_time = time.time()
            ret, frame = capture.read()
            if not ret:
                break
            time_load.append(time.time() - start_time)
            frame_no += 1  # frames are numbered from 1

            # NOTE(review): OpenCV decodes frames in BGR channel order — confirm
            # that this is the order the model expects.
            start_time = time.time()
            result = tf_detector.generate_detections_one_image(frame, frame_no)
            time_infer.append(time.time() - start_time)
            detection_results.append(result)

            # a frame whose inference failed has no 'detections' key; write it unannotated
            _draw_detections(frame, result.get('detections', []),
                             TFDetector.DEFAULT_DETECTOR_LABEL_MAP,
                             render_confidence_threshold)
            writer.write(frame)
    finally:
        # release the video handles even when a frame raises
        capture.release()
        if writer is not None:
            writer.release()


def load_and_run_detector(model_file, phi, video_file_names, output_dir,
                          render_confidence_threshold=TFDetector.DEFAULT_RENDERING_CONFIDENCE_THRESHOLD,
                          output_fps=10):
    """Load the detector and run it over each video, writing annotated videos and JSON.

    For every input video, two files are written to ``output_dir``: an MJPG
    ``<name>_detections.avi`` with boxes drawn on each frame, and a
    ``<name>_detections.json`` holding a list whose first element describes the
    video and whose remaining elements are the per-frame detection results.
    A video that fails is reported on stdout and recorded in its JSON file;
    the remaining videos are still processed.

    Args:
        model_file: path to the detector weights
        phi: EfficientDet scaling index passed to ``TFDetector``
        video_file_names: list of paths of the videos to process
        output_dir: existing directory that receives the outputs
        render_confidence_threshold: minimum confidence for a box to be drawn
        output_fps: frame rate of the annotated output videos

    Returns:
        None
    """
    if len(video_file_names) == 0:
        print('Warning: no files available')
        return

    # load the detector, run it on the target videos, and render the results
    start_time = time.time()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)
    tf_detector = TFDetector(model_file, phi)
    elapsed = time.time() - start_time
    print('Loaded model in {}'.format(humanfriendly.format_timespan(elapsed)))

    time_load = []
    time_infer = []

    # all outputs land in the same folder, so rename as necessary to avoid collisions
    used_output_names = set()

    for vid_file in tqdm(video_file_names):
        detection_results = []
        json_full_path = None
        try:
            name = os.path.splitext(os.path.basename(vid_file).lower())[0]
            # every rendering is saved as AVI, whatever the input container was
            fn = _unique_output_name(
                '{}{}{}'.format(name, VideoPathUtils.DETECTION_FILENAME_INSERT, '.avi'),
                used_output_names)
            output_full_path = os.path.join(output_dir, fn)
            json_full_path = os.path.join(output_dir, os.path.splitext(fn)[0] + '.json')

            _process_video(tf_detector, vid_file, output_full_path, detection_results,
                           render_confidence_threshold, output_fps, time_load, time_infer)
        except Exception as e:
            print('Video {} cannot be loaded. Exception: {}'.format(vid_file, e))
            detection_results.append({
                'file': vid_file,
                'failure': TFDetector.FAILURE_IMAGE_OPEN
            })

        # written in every case, so a failure record is persisted too
        if json_full_path is not None:
            try:
                with open(json_full_path, 'w') as jsonoutfile:
                    json.dump(detection_results, jsonoutfile)
            except OSError as e:
                print('Results for {} could not be written. Exception: {}'.format(vid_file, e))

    if time_infer:
        print('On average, for each frame, loading took {} and inference took {}'.format(
            humanfriendly.format_timespan(statistics.mean(time_load)),
            humanfriendly.format_timespan(statistics.mean(time_infer))))
The following block runs the code. Add additional files to video_file_names to include them in the inference.
# Videos to run the detector on; extend this list to process more files
video_file_names = ['vid0027.AVI']

load_and_run_detector(
    model_file='d3_updated.h5',
    phi=3,
    video_file_names=video_file_names,
    output_dir='./',
    render_confidence_threshold=0.6,
)
WARNING:tensorflow:Layer boxes is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.
If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.
To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.
100%|██████████| 1/1 [00:00<00:00, 6.05it/s]
Loaded model in 10.58 seconds
We then visualize the annotated video of the detections in the notebook.
# Embed the annotated clip in the notebook output.
# NOTE(review): load_and_run_detector saves its rendering as '<name>_detections.avi';
# presumably this .mp4 was produced by a separate conversion step — confirm it exists.
Video('vid0027_detections.mp4', embed=True)